Overview


1. Load Required Libraries

library(here)
library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
library(scales)

# Set theme for all plots
theme_set(theme_bw(base_size = 12))

# Create output directory
output_dir <- params$output_dir
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# Helper function to save figures in both PNG and PDF formats
save_figure <- function(plot, filename, width = 10, height = 8) {
  ggsave(file.path(output_dir, paste0(filename, ".png")), plot, 
         width = width, height = height, dpi = 300)
  ggsave(file.path(output_dir, paste0(filename, ".pdf")), plot, 
         width = width, height = height)
  cat("Saved:", filename, ".png/.pdf\n")
}

2. Load and Prepare Data

# Load preprocessed count data (created by 00_download_and_preprocess.R)
counts_file <- here("data", "processed", "mirna_counts_filtered.csv")
metadata_file <- here("data", "metadata", "sample_metadata.csv")

if (!file.exists(counts_file)) {
  stop("Preprocessed counts not found. Run 00_download_and_preprocess.R first.")
}

# Load data
count_data <- read_csv(counts_file, show_col_types = FALSE)
sample_metadata <- read_csv(metadata_file, show_col_types = FALSE)

# Convert to matrix with Gene_ID as rownames
raw_counts <- as.data.frame(count_data[, -1])  # remove Gene_ID column
rownames(raw_counts) <- count_data$Gene_ID

# Create sample mapping for compatibility with downstream code
sample_mapping <- sample_metadata %>%
  dplyr::select(Sample, Treatment_Group) %>%
  as.data.frame()
rownames(sample_mapping) <- sample_mapping$Sample
names(sample_mapping)[names(sample_mapping) == "Treatment_Group"] <- "comparison_7"

# Define sample groups for analysis
analysis_groups <- c("CF_Asp_Neg", "HC", "UNSTIM") 
samples_to_analyze <- sample_metadata %>%
  dplyr::filter(Treatment_Group %in% analysis_groups) %>%
  pull(Sample)

# Filter to samples in analysis
samples_to_analyze <- intersect(samples_to_analyze, colnames(raw_counts))

cat("Total samples in count data:", ncol(raw_counts), "\n")
## Total samples in count data: 78
cat("Total samples for analysis:", length(samples_to_analyze), "\n")
## Total samples for analysis: 46
cat("Groups included:", paste(analysis_groups, collapse = ", "), "\n")
## Groups included: CF_Asp_Neg, HC, UNSTIM

3. Calculate Fractional Abundances

# Extract counts for selected samples
raw_counts_subset <- raw_counts[, samples_to_analyze]

# Ensure numeric
raw_counts_subset <- as.data.frame(lapply(raw_counts_subset, as.numeric))
rownames(raw_counts_subset) <- rownames(raw_counts)

# Calculate total reads per sample
sample_totals <- colSums(raw_counts_subset, na.rm = TRUE)

# Calculate fractional abundance (each sample sums to 1)
frac_abundance <- sweep(raw_counts_subset, 2, sample_totals, FUN = "/")

# Verify all samples sum to 1
cat("\nVerification - all samples sum to 1.0:\n")
## 
## Verification - all samples sum to 1.0:
print(summary(colSums(frac_abundance)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cat("\nTotal miRNAs detected:", nrow(frac_abundance), "\n")
## 
## Total miRNAs detected: 2083
cat("Total reads across all samples:", format(sum(sample_totals), big.mark = ","), "\n")
## Total reads across all samples: 104,190,306

This replaces everything from the start of section 2 through the end of section 3. The key changes:

  1. Reads from data/processed/mirna_counts_filtered.csv instead of raw GEO Excel
  2. Removes the complex Excel parsing logic
  3. Uses the preprocessed data that already has correct sample IDs
  4. Keeps the same variable names (frac_abundance, sample_mapping, raw_counts) so downstream code works unchanged

After making this change, re-render the Rmd and check if the ANOVA p-values now match.

4. Identify Major miRNAs (≥1% Average Abundance)

# Calculate mean fractional abundance across all samples
mean_frac_abundance <- rowMeans(frac_abundance)

# Define threshold (1% = 0.01 fractional abundance)
threshold_pct <- 1
threshold_fraction <- threshold_pct / 100

# Identify miRNAs meeting the threshold
major_mirnas <- mean_frac_abundance[mean_frac_abundance >= threshold_fraction]
major_mirnas_sorted <- sort(major_mirnas, decreasing = TRUE)
major_mirna_names <- names(major_mirnas_sorted)

# Create detailed summary table
major_mirnas_table <- data.frame(
  Rank = 1:length(major_mirnas_sorted),
  miRNA = major_mirna_names,
  Mean_Fraction = major_mirnas_sorted,
  Percentage = major_mirnas_sorted * 100,
  Cumulative_Percentage = cumsum(major_mirnas_sorted) * 100
)

cat("\n=== Major miRNAs (≥1% of Total Reads) ===\n\n")
## 
## === Major miRNAs (≥1% of Total Reads) ===
print(major_mirnas_table, row.names = FALSE, digits = 4)
##  Rank        miRNA Mean_Fraction Percentage Cumulative_Percentage
##     1     miR-6126       0.33169     33.169                 33.17
##     2     miR-3197       0.09459      9.459                 42.63
##     3     miR-2861       0.05651      5.651                 48.28
##     4     miR-6131       0.04833      4.833                 53.11
##     5     miR-4417       0.04205      4.205                 57.32
##     6 miR-6780b-5p       0.02331      2.331                 59.65
##     7   miR-149-3p       0.02273      2.273                 61.92
##     8    miR-21-5p       0.01997      1.997                 63.92
##     9  miR-6803-5p       0.01771      1.771                 65.69
##    10  miR-1237-5p       0.01291      1.291                 66.98
##    11     miR-3687       0.01221      1.221                 68.20
##    12    miR-1273d       0.01157      1.157                 69.36
##    13     miR-3648       0.01012      1.012                 70.37
# Summary statistics
total_major_fraction <- sum(major_mirnas_sorted)
remaining_fraction <- 1 - total_major_fraction
n_major <- length(major_mirnas_sorted)
n_total <- nrow(frac_abundance)

cat("\n=== Summary Statistics ===\n")
## 
## === Summary Statistics ===
cat("Number of miRNAs with ≥1% abundance:", n_major, "\n")
## Number of miRNAs with ≥1% abundance: 13
cat("These", n_major, "miRNAs collectively represent:", 
    sprintf("%.2f%%", total_major_fraction * 100), "of all reads\n")
## These 13 miRNAs collectively represent: 70.37% of all reads
cat("Remaining", n_total - n_major, "miRNAs represent:", 
    sprintf("%.2f%%", remaining_fraction * 100), "of all reads\n")
## Remaining 2070 miRNAs represent: 29.63% of all reads
cat("Concentration ratio (Major / Remaining):", 
    sprintf("%.2f", total_major_fraction / remaining_fraction), "\n")
## Concentration ratio (Major / Remaining): 2.38
cat("\nMost abundant:", major_mirna_names[1], 
    "at", sprintf("%.2f%%", major_mirnas_sorted[1] * 100), "\n")
## 
## Most abundant: miR-6126 at 33.17%
cat("Least abundant (of major):", major_mirna_names[n_major], 
    "at", sprintf("%.2f%%", major_mirnas_sorted[n_major] * 100), "\n")
## Least abundant (of major): miR-3648 at 1.01%
# Save summary table
write.csv(major_mirnas_table, file.path(output_dir, "major_miRNAs_1pct_summary.csv"), 
          row.names = FALSE)

5. Visualizations of Major miRNAs

5.1 Bar Plot: Mean Fractional Abundance

# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.35)

# Prepare data for plotting
major_plot_data <- major_mirnas_table %>%
  mutate(miRNA = factor(miRNA, levels = rev(major_mirna_names))) # Reverse for top-down display

# Create bar plot
p1 <- ggplot(major_plot_data, aes(x = miRNA, y = Percentage)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 0.8) +
  geom_text(aes(label = sprintf("%.2f%%", Percentage)), 
            hjust = -0.1, size = 3.5) +
  coord_flip() +
  labs(
    title = "Major miRNAs (≥1% Average Abundance)",
    subtitle = sprintf("%d miRNAs collectively represent %.1f%% of all reads", 
                      n_major, total_major_fraction * 100),
    x = "miRNA",
    y = "Mean Fractional Abundance (%)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.y = element_text(face = "bold")
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))

print(p1)

save_figure(p1, "major_miRNAs_barplot", width = 10, height = fig_height)
## Saved: major_miRNAs_barplot .png/.pdf

5.2 Cumulative Abundance Plot

# Rank all miRNAs by abundance
all_ranked <- data.frame(
  miRNA = names(mean_frac_abundance),
  Mean_Fraction = mean_frac_abundance
) %>%
  arrange(desc(Mean_Fraction)) %>%
  mutate(
    Rank = 1:n(),
    Cumulative_Fraction = cumsum(Mean_Fraction),
    Cumulative_Percentage = Cumulative_Fraction * 100,
    Category = ifelse(Rank <= n_major, "Major (≥1%)", "Others")
  )

# Create cumulative plot
p2 <- ggplot(all_ranked, aes(x = Rank, y = Cumulative_Percentage)) +
  geom_line(size = 1.2, color = "darkblue") +
  geom_point(data = all_ranked[all_ranked$Rank <= n_major, ], 
             aes(x = Rank, y = Cumulative_Percentage),
             color = "red", size = 3) +
  geom_hline(yintercept = total_major_fraction * 100, 
             linetype = "dashed", color = "red", size = 0.8) +
  geom_vline(xintercept = n_major, linetype = "dashed", color = "gray50", size = 0.8) +
  annotate("text", x = n_major, y = 5, 
           label = paste("Top", n_major), color = "red", fontface = "bold", hjust = -0.1) +
  annotate("text", x = 200, y = total_major_fraction * 100 + 5,
           label = sprintf("%.1f%%", total_major_fraction * 100),
           color = "red", fontface = "bold") +
  labs(
    title = "Cumulative Fractional Abundance of miRNAs",
    subtitle = paste("Red points indicate major miRNAs (≥1% average abundance)"),
    x = "miRNA Rank (by abundance)",
    y = "Cumulative Percentage (%)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14)
  ) +
  scale_x_continuous(breaks = c(1, n_major, 50, 100, 200, 400, 600)) +
  scale_y_continuous(breaks = seq(0, 100, 10))

print(p2)

save_figure(p2, "cumulative_abundance_plot", width = 10, height = 6)
## Saved: cumulative_abundance_plot .png/.pdf

5.3 Stacked Bar Composition

# Create simplified composition data
composition_data <- data.frame(
  Sample = "All Samples Combined",
  Major = total_major_fraction * 100,
  Others = remaining_fraction * 100
) %>%
  pivot_longer(cols = c(Major, Others), 
               names_to = "Category", 
               values_to = "Percentage")

# Create stacked bar
p3 <- ggplot(composition_data, aes(x = Sample, y = Percentage, fill = Category)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = sprintf("%.1f%%", Percentage)),
            position = position_stack(vjust = 0.5),
            size = 6, fontface = "bold", color = "white") +
  scale_fill_manual(
    values = c("Major" = "steelblue", "Others" = "gray70"),
    labels = c("Major" = paste("Major miRNAs (≥1%, n =", n_major, ")"), 
               "Others" = "All Other miRNAs")
  ) +
  labs(
    title = "Compositional Dominance of Major miRNAs",
    y = "Percentage of Total Reads (%)",
    x = NULL,
    fill = "Category"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    legend.position = "bottom",
    legend.title = element_text(face = "bold"),
    axis.text.x = element_blank()
  ) +
  scale_y_continuous(breaks = seq(0, 100, 10))

print(p3)

save_figure(p3, "composition_stacked_bar", width = 10, height = 6)
## Saved: composition_stacked_bar .png/.pdf

6. Sample-Level Variation in Major miRNAs

6.1 Heatmap Across Samples

# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.4)

# Extract major miRNA data for all samples
major_data <- frac_abundance[major_mirna_names, ] * 100 # Convert to percentage

# Add treatment group information
sample_groups <- sample_mapping[colnames(major_data), "comparison_7"]

# Prepare data for heatmap
heatmap_data <- major_data %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  pivot_longer(cols = -miRNA, names_to = "Sample", values_to = "Percentage") %>%
  mutate(
    miRNA = factor(miRNA, levels = major_mirna_names),
    Group = sample_mapping[Sample, "comparison_7"]
  )

# Create heatmap
p4 <- ggplot(heatmap_data, aes(x = Sample, y = miRNA, fill = Percentage)) +
  geom_tile(color = "white", size = 0.5) +
  scale_fill_gradient2(
    low = "white", 
    mid = "lightblue", 
    high = "darkblue",
    midpoint = median(heatmap_data$Percentage),
    name = "Abundance\n(%)"
  ) +
  labs(
    title = "Fractional Abundance of Major miRNAs Across Samples",
    subtitle = "miRNAs with ≥1% average abundance",
    x = "Sample",
    y = "miRNA"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
    axis.text.y = element_text(face = "bold"),
    legend.position = "right"
  ) +
  facet_grid(. ~ Group, scales = "free_x", space = "free_x")

print(p4)

save_figure(p4, "major_miRNAs_heatmap_by_sample", width = 14, height = fig_height)
## Saved: major_miRNAs_heatmap_by_sample .png/.pdf

6.2 Boxplots by Treatment Group

# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.6)

# Create boxplot for each miRNA across treatment groups
p5 <- ggplot(heatmap_data, aes(x = Group, y = Percentage, fill = Group)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 1.5) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
  facet_wrap(~ miRNA, scales = "free_y", ncol = 5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Distribution of Major miRNA Abundances by Treatment Group",
    subtitle = "miRNAs with ≥1% average abundance",
    x = "Treatment Group",
    y = "Fractional Abundance (%)"
  ) +
  theme_bw(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold"),
    legend.position = "bottom"
  )

print(p5)

save_figure(p5, "major_miRNAs_boxplot_by_group", width = 12, height = fig_height)
## Saved: major_miRNAs_boxplot_by_group .png/.pdf

6.3 Statistical Testing: Treatment Group Differences

Testing whether mean abundance of major miRNAs differs significantly between treatment groups using one-way ANOVA with post-hoc pairwise comparisons.

# Prepare data for statistical testing
test_data <- frac_abundance[major_mirna_names, ] %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
  mutate(
    treatment = sample_mapping[sample, "comparison_7"],
    percentage = fraction * 100  # Convert to percentage for easier interpretation
  )

# Function to perform ANOVA and Tukey HSD for each miRNA
perform_anova_tests <- function(mirna_name, data) {
  mirna_data <- data %>% filter(miRNA == mirna_name)
  
  # Perform one-way ANOVA
  anova_result <- aov(percentage ~ treatment, data = mirna_data)
  anova_summary <- summary(anova_result)
  
  # Extract p-value
  anova_pval <- anova_summary[[1]][["Pr(>F)"]][1]
  
  # Perform Tukey HSD post-hoc test if ANOVA is significant
  if(anova_pval < 0.05) {
    tukey_result <- TukeyHSD(anova_result)
    tukey_comparisons <- as.data.frame(tukey_result$treatment)
    tukey_comparisons$comparison <- rownames(tukey_comparisons)
    tukey_comparisons$miRNA <- mirna_name
  } else {
    tukey_comparisons <- NULL
  }
  
  # Calculate mean abundance per treatment
  treatment_means <- mirna_data %>%
    group_by(treatment) %>%
    summarise(mean_pct = mean(percentage), 
              sd_pct = sd(percentage),
              n = n(),
              .groups = 'drop')
  
  return(list(
    miRNA = mirna_name,
    anova_pval = anova_pval,
    tukey_comparisons = tukey_comparisons,
    treatment_means = treatment_means
  ))
}

# Run ANOVA tests for all major miRNAs
anova_results <- lapply(major_mirna_names, perform_anova_tests, data = test_data)

# Create summary table of ANOVA results
anova_summary_table <- data.frame(
  miRNA = sapply(anova_results, function(x) x$miRNA),
  ANOVA_P_Value = sapply(anova_results, function(x) x$anova_pval),
  Significant = sapply(anova_results, function(x) ifelse(x$anova_pval < 0.05, "Yes", "No"))
)

# Adjust for multiple testing using BH correction
anova_summary_table$BH_Adjusted_P <- p.adjust(anova_summary_table$ANOVA_P_Value, 
                                                       method = "BH")
anova_summary_table$Significant_After_Correction <- ifelse(
  anova_summary_table$BH_Adjusted_P < 0.05, "Yes", "No"
)

# Sort by p-value
anova_summary_table <- anova_summary_table %>% arrange(ANOVA_P_Value)

cat("\n=== ANOVA Results for Major miRNAs ===\n")
## 
## === ANOVA Results for Major miRNAs ===
cat("Testing for differences in mean abundance between treatment groups\n")
## Testing for differences in mean abundance between treatment groups
cat("(UNSTIM vs HC vs CF)\n\n")
## (UNSTIM vs HC vs CF)
print(anova_summary_table, row.names = FALSE, digits = 4)
##         miRNA ANOVA_P_Value Significant BH_Adjusted_P Significant_After_Correction
##      miR-6126     0.0003900         Yes       0.00294                          Yes
##   miR-6803-5p     0.0004523         Yes       0.00294                          Yes
##      miR-2861     0.0192520         Yes       0.08185                           No
##     miR-21-5p     0.0251845         Yes       0.08185                           No
##      miR-3648     0.0316774         Yes       0.08236                           No
##      miR-3197     0.0520371          No       0.09594                           No
##   miR-1237-5p     0.0628196          No       0.09594                           No
##    miR-149-3p     0.0662925          No       0.09594                           No
##  miR-6780b-5p     0.0664177          No       0.09594                           No
##      miR-3687     0.1252155          No       0.16114                           No
##     miR-1273d     0.1363509          No       0.16114                           No
##      miR-6131     0.2982792          No       0.32314                           No
##      miR-4417     0.3282540          No       0.32825                           No
# Count significant results
n_sig_raw <- sum(anova_summary_table$ANOVA_P_Value < 0.05)
n_sig_corrected <- sum(anova_summary_table$BH_Adjusted_P < 0.05)

cat("\n=== Summary ===\n")
## 
## === Summary ===
cat("Total miRNAs tested:", n_major, "\n")
## Total miRNAs tested: 13
cat("Significant at α = 0.05 (uncorrected):", n_sig_raw, "\n")
## Significant at α = 0.05 (uncorrected): 5
cat("Significant after BH correction:", n_sig_corrected, "\n")
## Significant after BH correction: 2
# Save ANOVA results
write.csv(anova_summary_table, file.path(output_dir, "major_miRNAs_ANOVA_results.csv"), 
          row.names = FALSE)

# Extract and display post-hoc comparisons for significant miRNAs
if(n_sig_raw > 0) {
  cat("\n=== Post-hoc Pairwise Comparisons (Tukey HSD) ===\n")
  cat("For miRNAs with significant ANOVA results (p < 0.05)\n\n")
  
  significant_mirnas <- anova_summary_table$miRNA[anova_summary_table$ANOVA_P_Value < 0.05]
  
  posthoc_results <- data.frame()
  for(mirna in significant_mirnas) {
    result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
    if(!is.null(result$tukey_comparisons)) {
      tukey_df <- result$tukey_comparisons %>%
        dplyr::select(miRNA, comparison, diff, `p adj`) %>%
        mutate(
          Significant = ifelse(`p adj` < 0.05, "Yes", "No")
        )
      posthoc_results <- rbind(posthoc_results, tukey_df)
    }
  }
  
  if(nrow(posthoc_results) > 0) {
    colnames(posthoc_results) <- c("miRNA", "Comparison", "Mean_Difference", 
                                     "Tukey_P_Value", "Significant")
    print(posthoc_results, row.names = FALSE, digits = 4)
    write.csv(posthoc_results, file.path(output_dir, "major_miRNAs_posthoc_comparisons.csv"), 
              row.names = FALSE)
  }
  
  # Create a summary of treatment means for significant miRNAs
  cat("\n=== Mean Abundances by Treatment (Significant miRNAs Only) ===\n\n")
  for(mirna in significant_mirnas) {
    result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
    cat(mirna, ":\n")
    print(result$treatment_means, row.names = FALSE, digits = 3)
    cat("\n")
  }
} else {
  cat("\nNo miRNAs showed significant differences between treatment groups at α = 0.05\n")
}
## 
## === Post-hoc Pairwise Comparisons (Tukey HSD) ===
## For miRNAs with significant ANOVA results (p < 0.05)
## 
##        miRNA        Comparison Mean_Difference Tukey_P_Value Significant
##     miR-6126     HC-CF_Asp_Neg          5.4137     0.5406546          No
##     miR-6126 UNSTIM-CF_Asp_Neg         20.6747     0.0003695         Yes
##     miR-6126         UNSTIM-HC         15.2609     0.0121355         Yes
##  miR-6803-5p     HC-CF_Asp_Neg         -1.1744     0.0503104          No
##  miR-6803-5p UNSTIM-CF_Asp_Neg         -2.0049     0.0002930         Yes
##  miR-6803-5p         UNSTIM-HC         -0.8304     0.2114420          No
##     miR-2861     HC-CF_Asp_Neg         -1.0164     0.8367911          No
##     miR-2861 UNSTIM-CF_Asp_Neg         -4.8332     0.0201599         Yes
##     miR-2861         UNSTIM-HC         -3.8169     0.0936520          No
##    miR-21-5p     HC-CF_Asp_Neg          1.7913     0.0656208          No
##    miR-21-5p UNSTIM-CF_Asp_Neg         -0.2461     0.9424930          No
##    miR-21-5p         UNSTIM-HC         -2.0374     0.0315782         Yes
##     miR-3648     HC-CF_Asp_Neg         -0.9553     0.1172778          No
##     miR-3648 UNSTIM-CF_Asp_Neg         -1.1766     0.0343769         Yes
##     miR-3648         UNSTIM-HC         -0.2213     0.8855847          No
## 
## === Mean Abundances by Treatment (Significant miRNAs Only) ===
## 
## miR-6126 :
## # A tibble: 3 × 4
##   treatment  mean_pct sd_pct     n
##   <chr>         <dbl>  <dbl> <int>
## 1 CF_Asp_Neg     24.3  14.6     16
## 2 HC             29.7  17.9     14
## 3 UNSTIM         45.0   7.79    16
## 
## miR-6803-5p :
## # A tibble: 3 × 4
##   treatment  mean_pct sd_pct     n
##   <chr>         <dbl>  <dbl> <int>
## 1 CF_Asp_Neg    2.83   2.07     16
## 2 HC            1.65   0.851    14
## 3 UNSTIM        0.821  0.334    16
## 
## miR-2861 :
## # A tibble: 3 × 4
##   treatment  mean_pct sd_pct     n
##   <chr>         <dbl>  <dbl> <int>
## 1 CF_Asp_Neg     7.64   5.57    16
## 2 HC             6.62   6.40    14
## 3 UNSTIM         2.81   1.28    16
## 
## miR-21-5p :
## # A tibble: 3 × 4
##   treatment  mean_pct sd_pct     n
##   <chr>         <dbl>  <dbl> <int>
## 1 CF_Asp_Neg     1.54  1.71     16
## 2 HC             3.33  3.34     14
## 3 UNSTIM         1.29  0.570    16
## 
## miR-3648 :
## # A tibble: 3 × 4
##   treatment  mean_pct sd_pct     n
##   <chr>         <dbl>  <dbl> <int>
## 1 CF_Asp_Neg    1.71   2.12     16
## 2 HC            0.756  0.474    14
## 3 UNSTIM        0.535  0.257    16

6.4 Visualization of Significant Differences

if(n_sig_raw > 0) {
  # Get list of significant miRNAs
  significant_mirnas <- anova_summary_table$miRNA[anova_summary_table$ANOVA_P_Value < 0.05]
  
  # Filter data for significant miRNAs only
  sig_plot_data <- test_data %>%
    filter(miRNA %in% significant_mirnas) %>%
    mutate(
      miRNA = factor(miRNA, levels = significant_mirnas),
      treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF_Asp_Neg"),
                         labels = c("UNSTIM", "HC", "CF"))
    )
  
  # Prepare annotation data from Tukey HSD results
  annotation_data <- data.frame()
  
  for(mirna in significant_mirnas) {
    result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
    
    if(!is.null(result$tukey_comparisons)) {
      tukey_df <- result$tukey_comparisons
      max_val <- max(sig_plot_data$percentage[sig_plot_data$miRNA == mirna])
      comparisons <- tukey_df$comparison
      p_values <- tukey_df$`p adj`
      
      for(i in 1:length(comparisons)) {
        if(p_values[i] < 0.05) {
          comp <- comparisons[i]
          groups <- strsplit(comp, "-")[[1]]
          groups <- recode(groups, "CF_Asp_Neg" = "CF")
          
          if(p_values[i] < 0.001) {
            sig_label <- "***"
          } else if(p_values[i] < 0.01) {
            sig_label <- "**"
          } else {
            sig_label <- "*"
          }
          
          annotation_data <- rbind(annotation_data, data.frame(
            miRNA = mirna,
            comparison = comp,
            group1 = groups[1],
            group2 = groups[2],
            p_value = p_values[i],
            label = sig_label,
            y_position = max_val,
            stringsAsFactors = FALSE
          ))
        }
      }
    }
  }
  
  # Create base plot
  p_sig <- ggplot(sig_plot_data, aes(x = treatment, y = percentage, fill = treatment)) +
    geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 1.5) +
    geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
    stat_summary(fun = mean, geom = "point", shape = 23, size = 3, 
                 fill = "red", color = "darkred") +
    facet_wrap(~ miRNA, scales = "free_y", ncol = 4) +
    scale_fill_brewer(palette = "Set2") +
    labs(
      title = "Major miRNAs with Significant Treatment Differences",
      subtitle = "Red diamonds = group means; * p<0.05, ** p<0.01, *** p<0.001 (Tukey HSD)",
      x = "Treatment Group",
      y = "Fractional Abundance (%)",
      fill = "Treatment"
    ) +
    theme_bw(base_size = 11) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10),
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text = element_text(face = "bold", size = 10),
      strip.background = element_rect(fill = "grey95"),
      legend.position = "bottom"
    )
  
  # Add significance annotations if any exist
  if(nrow(annotation_data) > 0) {
    annotation_data_positioned <- annotation_data %>%
      group_by(miRNA) %>%
      mutate(
        max_y = max(sig_plot_data$percentage[sig_plot_data$miRNA == miRNA]),
        bracket_y = max_y + (max_y * 0.05 * row_number())
      ) %>%
      ungroup()
    
    treatment_positions <- c("UNSTIM" = 1, "HC" = 2, "CF" = 3)
    
    annotation_data_positioned$x1 <- treatment_positions[annotation_data_positioned$group1]
    annotation_data_positioned$x2 <- treatment_positions[annotation_data_positioned$group2]
    annotation_data_positioned$x_mid <- (annotation_data_positioned$x1 + 
                                         annotation_data_positioned$x2) / 2
    
    p_sig <- p_sig +
      geom_segment(data = annotation_data_positioned,
                   aes(x = x1, xend = x1, y = bracket_y, yend = bracket_y * 1.02),
                   inherit.aes = FALSE, color = "black") +
      geom_segment(data = annotation_data_positioned,
                   aes(x = x1, xend = x2, y = bracket_y * 1.02, yend = bracket_y * 1.02),
                   inherit.aes = FALSE, color = "black") +
      geom_segment(data = annotation_data_positioned,
                   aes(x = x2, xend = x2, y = bracket_y * 1.02, yend = bracket_y),
                   inherit.aes = FALSE, color = "black") +
      geom_text(data = annotation_data_positioned,
                aes(x = x_mid, y = bracket_y * 1.04, label = label),
                inherit.aes = FALSE, size = 4, fontface = "bold")
  }
  
  print(p_sig)
  save_figure(p_sig, "major_miRNAs_significant_differences", 
              width = 12, height = max(6, n_sig_raw*1.5))
  cat("\nSignificance annotations show Tukey HSD post-hoc test results:\n")
  cat("  * p < 0.05\n")
  cat(" ** p < 0.01\n")
  cat("*** p < 0.001\n")
} else {
  cat("No significant differences detected; skipping visualization.\n")
}

## Saved: major_miRNAs_significant_differences .png/.pdf
## 
## Significance annotations show Tukey HSD post-hoc test results:
##   * p < 0.05
##  ** p < 0.01
## *** p < 0.001

6.5 Summary Table: Mean Abundance by Treatment

# Create comprehensive summary table
summary_by_treatment <- test_data %>%
  group_by(miRNA, treatment) %>%
  summarise(
    Mean_Percentage = mean(percentage),
    SD = sd(percentage),
    N = n(),
    SE = sd(percentage) / sqrt(n()),
    .groups = 'drop'
  ) %>%
  pivot_wider(
    names_from = treatment,
    values_from = c(Mean_Percentage, SD, SE),
    names_glue = "{treatment}_{.value}"
  ) %>%
  left_join(
    anova_summary_table %>% dplyr::select(miRNA, ANOVA_P_Value, BH_Adjusted_P),
    by = "miRNA"
  ) %>%
  arrange(ANOVA_P_Value)

cat("\n=== Mean Abundance by Treatment Group ===\n")
## 
## === Mean Abundance by Treatment Group ===
print(summary_by_treatment, digits = 3)
## # A tibble: 26 × 13
##    miRNA      N CF_Asp_Neg_Mean_Perc…¹ HC_Mean_Percentage UNSTIM_Mean_Percentage CF_Asp_Neg_SD  HC_SD UNSTIM_SD CF_Asp_Neg_SE
##    <chr>  <int>                  <dbl>              <dbl>                  <dbl>         <dbl>  <dbl>     <dbl>         <dbl>
##  1 miR-6…    16                  24.3              NA                     45.0           14.6  NA         7.79          3.66 
##  2 miR-6…    14                  NA                29.7                   NA             NA    17.9      NA            NA    
##  3 miR-6…    16                   2.83             NA                      0.821          2.07 NA         0.334         0.517
##  4 miR-6…    14                  NA                 1.65                  NA             NA     0.851    NA            NA    
##  5 miR-2…    16                   7.64             NA                      2.81           5.57 NA         1.28          1.39 
##  6 miR-2…    14                  NA                 6.62                  NA             NA     6.40     NA            NA    
##  7 miR-2…    16                   1.54             NA                      1.29           1.71 NA         0.570         0.427
##  8 miR-2…    14                  NA                 3.33                  NA             NA     3.34     NA            NA    
##  9 miR-3…    16                   1.71             NA                      0.535          2.12 NA         0.257         0.529
## 10 miR-3…    14                  NA                 0.756                 NA             NA     0.474    NA            NA    
## # ℹ 16 more rows
## # ℹ abbreviated name: ¹​CF_Asp_Neg_Mean_Percentage
## # ℹ 4 more variables: HC_SE <dbl>, UNSTIM_SE <dbl>, ANOVA_P_Value <dbl>, BH_Adjusted_P <dbl>
write.csv(summary_by_treatment, file.path(output_dir, "major_miRNAs_treatment_summary.csv"), 
          row.names = FALSE)
cat("\nComprehensive summary saved to: major_miRNAs_treatment_summary.csv\n")
## 
## Comprehensive summary saved to: major_miRNAs_treatment_summary.csv

6.6 Stacked Bar Plot by Sample and Treatment

This visualization shows the compositional breakdown for each individual sample, allowing you to see both within-group consistency and between-group differences.

# Prepare data for stacked bar chart
stack_data <- frac_abundance %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
  mutate(
    miRNA_category = ifelse(miRNA %in% major_mirna_names, miRNA, "Other"),
    treatment = sample_mapping[sample, "comparison_7"],
    # Relabel CF_Asp_Neg to CF
    treatment = recode(treatment, "CF_Asp_Neg" = "CF")
  ) %>%
  group_by(sample, treatment, miRNA_category) %>%
  summarise(total_fraction = sum(fraction), .groups = 'drop')

# Set factor levels: most abundant at bottom, "Other" at top
stack_data <- stack_data %>%
  mutate(
    miRNA_category = factor(miRNA_category, 
                           levels = rev(c(major_mirna_names, "Other"))),
    treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF"))
  )

# Create color palette: colors for major miRNAs + grey for "Other"
n_colors <- min(n_major, 12)
if (n_major <= 12) {
  stack_colors <- rev(c(brewer.pal(n_major, "Paired"), "grey80"))
} else {
  base_colors <- brewer.pal(12, "Paired")
  stack_colors <- rev(c(rep(base_colors, ceiling(n_major/12))[1:n_major], "grey80"))
}
names(stack_colors) <- rev(c(major_mirna_names, "Other"))

# Create stacked bar plot
p6 <- ggplot(stack_data, aes(x = sample, y = total_fraction, fill = miRNA_category)) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_manual(values = stack_colors, name = "miRNA") +
  facet_wrap(~ treatment, scales = "free_x", nrow = 1) +
  labs(
    title = "miRNA Composition by Sample and Treatment",
    subtitle = sprintf("Major miRNAs (≥1%%, n=%d) shown individually (~%.1f%% of reads), remaining as 'Other'",
                     n_major, total_major_fraction * 100),
    x = "",
    y = "Fractional Abundance"
  ) +
  theme_classic(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 11),
    strip.text = element_text(size = 11, face = "bold"),
    strip.background = element_rect(fill = "grey95", color = "grey50"),
    legend.position = "bottom",
    legend.text = element_text(size = 9, family = "mono"),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE, reverse = TRUE))

print(p6)

save_figure(p6, "major_miRNAs_stacked_by_sample", width = 14, height = 8)
## Saved: major_miRNAs_stacked_by_sample .png/.pdf

7. Compositional Differences Between Treatment Groups

The three treatment groups show significantly different cumulative distribution functions, indicating that exposure conditions alter the compositional structure of miRNA cargo in extracellular vesicles.

7.1 Statistical Testing: Kolmogorov-Smirnov Tests

# Prepare data for statistical testing
abundance_by_treatment <- frac_abundance %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
  mutate(treatment = sample_mapping[sample, "comparison_7"]) %>%
  group_by(treatment, miRNA) %>%
  summarise(mean_fraction = mean(fraction, na.rm = TRUE), .groups = 'drop') %>%
  group_by(treatment) %>%
  arrange(desc(mean_fraction)) %>%
  mutate(
    rank = row_number(),
    cumsum_fraction = cumsum(mean_fraction),
    total_fraction = sum(mean_fraction),
    cumsum_pct = cumsum_fraction / total_fraction * 100,
    log10_fraction = log10(mean_fraction + 1e-8)  # Add small constant to avoid log(0)
  ) %>%
  ungroup()

# Perform pairwise Kolmogorov-Smirnov tests
treatments <- c("UNSTIM", "HC", "CF_Asp_Neg")
ks_results <- data.frame()

for(i in 1:(length(treatments)-1)) {
  for(j in (i+1):length(treatments)) {
    treat1 <- treatments[i]
    treat2 <- treatments[j]
    
    data1 <- abundance_by_treatment %>% 
      filter(treatment == treat1) %>% 
      pull(mean_fraction)
    data2 <- abundance_by_treatment %>% 
      filter(treatment == treat2) %>% 
      pull(mean_fraction)
    
    ks_test <- ks.test(data1, data2)
    
    ks_results <- rbind(ks_results, data.frame(
      Comparison = paste(treat1, "vs", treat2),
      KS_Statistic = round(as.numeric(ks_test$statistic), 4),
      P_Value = formatC(ks_test$p.value, format = "e", digits = 3),
      Significant = ifelse(ks_test$p.value < 0.05, "Yes", "No")
    ))
  }
}

cat("\n=== Kolmogorov-Smirnov Test Results ===\n")
## 
## === Kolmogorov-Smirnov Test Results ===
cat("Tests whether miRNA abundance distributions differ between treatment groups\n\n")
## Tests whether miRNA abundance distributions differ between treatment groups
print(ks_results, row.names = FALSE)
##            Comparison KS_Statistic    P_Value Significant
##          UNSTIM vs HC       0.5655 9.468e-290         Yes
##  UNSTIM vs CF_Asp_Neg       0.4710 4.507e-201         Yes
##      HC vs CF_Asp_Neg       0.4364 1.061e-172         Yes
# Save results
write.csv(ks_results, file.path(output_dir, "KS_test_results.csv"), row.names = FALSE)

cat("\n\nInterpretation:\n")
## 
## 
## Interpretation:
cat("All pairwise comparisons show significant distributional differences (p < 0.05).\n")
## All pairwise comparisons show significant distributional differences (p < 0.05).
cat("This indicates that exposure to CF BAL, HC BAL, or no exposure (UNSTIM)\n")
## This indicates that exposure to CF BAL, HC BAL, or no exposure (UNSTIM)
cat("results in fundamentally different compositional structures of miRNA cargo.\n")
## results in fundamentally different compositional structures of miRNA cargo.

7.2 Density Plots: Visualizing Distributional Shapes

These plots reveal that HC (healthy control BAL) has a much broader distributional tail compared to UNSTIM and CF_Asp_Neg, suggesting greater compositional diversity or evenness. CF_Asp_Neg appears compositionally more similar to UNSTIM than to HC.

# Set factor levels for desired order: UNSTIM (top), HC (middle), CF_Asp_Neg (bottom)
abundance_by_treatment <- abundance_by_treatment %>%
  mutate(
    treatment = recode(treatment, "CF_Asp_Neg" = "CF"),
    treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF"))
  )

# Create density plots
p7 <- ggplot(abundance_by_treatment, aes(x = log10_fraction, fill = treatment)) +
  geom_density(alpha = 0.6, size = 1) +
  facet_wrap(~ treatment, ncol = 1) +
  labs(
    title = "Distribution Density of miRNA Abundances by Treatment",
    subtitle = "HC shows broader distributional tail; CF_Asp_Neg more similar to UNSTIM",
    x = "Log10(Fractional Abundance)",
    y = "Density",
    fill = "Treatment"
  ) +
  theme_classic(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    strip.text = element_text(size = 12, face = "bold"),
    strip.background = element_rect(fill = "grey95", color = "grey50")
  ) +
  scale_fill_brewer(palette = "Set2")

print(p7)

save_figure(p7, "abundance_density_plots", width = 8, height = 10)
## Saved: abundance_density_plots .png/.pdf
cat("\n\nKey observations from density plots:\n")
## 
## 
## Key observations from density plots:
cat("- HC (Healthy Control): Broader, more extended tail → greater diversity/evenness\n")
## - HC (Healthy Control): Broader, more extended tail → greater diversity/evenness
cat("- UNSTIM: More concentrated distribution with shorter tail\n")
## - UNSTIM: More concentrated distribution with shorter tail
cat("- CF_Asp_Neg: Similar shape to UNSTIM, suggesting CF exposure produces\n")
## - CF_Asp_Neg: Similar shape to UNSTIM, suggesting CF exposure produces
cat("  a compositional structure more like unstimulated cells than HC-stimulated\n")
##   a compositional structure more like unstimulated cells than HC-stimulated

7.3 Empirical Cumulative Distribution Functions (ECDF)

This plot directly visualizes what the Kolmogorov-Smirnov test compares. Larger vertical gaps between curves indicate greater distributional differences.

# Create ECDF plot
p8 <- ggplot(abundance_by_treatment, aes(x = log10_fraction, color = treatment)) +
  stat_ecdf(size = 1.2, alpha = 0.8) +
  labs(
    title = "Empirical Cumulative Distribution Functions",
    subtitle = "Direct visualization of Kolmogorov-Smirnov test comparisons\nLarger gaps between curves = more significant differences",
    x = "Log10(Fractional Abundance)",
    y = "Cumulative Probability",
    color = "Treatment"
  ) +
  theme_classic(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) +
  scale_color_brewer(palette = "Set2") +
  scale_y_continuous(labels = percent_format())

print(p8)

save_figure(p8, "ECDF_by_treatment", width = 10, height = 7)
## Saved: ECDF_by_treatment .png/.pdf

7.4 Cumulative Abundance Curves

Shows how quickly each treatment reaches abundance thresholds. Steeper curves indicate more concentrated distributions where fewer miRNAs dominate.

# Create cumulative abundance curves
p9 <- ggplot(abundance_by_treatment, aes(x = rank, y = cumsum_pct, color = treatment)) +
  geom_line(size = 1.2, alpha = 0.8) +
  geom_hline(yintercept = c(50, 80, 90, 95), linetype = "dashed", alpha = 0.5) +
  scale_x_log10() +
  labs(
    title = "Cumulative Abundance Curves by Treatment",
    subtitle = "How many miRNAs needed to reach X% of total abundance",
    x = "Number of Top miRNAs (log scale)",
    y = "Cumulative % of Total Abundance",
    color = "Treatment"
  ) +
  theme_classic(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 14)
  ) +
  scale_color_brewer(palette = "Set2") +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  annotate("text", x = 1000, y = c(50, 80, 90, 95), 
           label = c("50%", "80%", "90%", "95%"), 
           hjust = 1, alpha = 0.7, size = 3.5)

print(p9)

save_figure(p9, "cumulative_abundance_by_treatment", width = 10, height = 7)
## Saved: cumulative_abundance_by_treatment .png/.pdf

8. Diversity Analysis

Ecological diversity metrics provide a complementary perspective on compositional differences. While Section 7 examined distributional shapes, this section quantifies how evenly miRNA reads are distributed across species using established diversity indices.

8.1 Calculate Diversity Metrics

# Function to calculate Gini coefficient (0 = perfect equality, 1 = complete inequality)
gini_coef <- function(x) {
  x <- sort(x[x > 0])
  n <- length(x)
  if(n == 0) return(NA)
  2 * sum((1:n) * x) / (n * sum(x)) - (n + 1) / n
}

# Calculate diversity metrics per sample
# Shannon entropy: H = -Σ(p * ln(p))
shannon_per_sample <- apply(frac_abundance, 2, function(x) {
  x <- x[x > 0]  # Remove zeros
  -sum(x * log(x))
})

# Simpson's diversity (1 - D): probability that two randomly selected reads are different miRNAs
simpson_per_sample <- apply(frac_abundance, 2, function(x) {
  1 - sum(x^2)
})

# Effective number of species (Hill number q=1): exponential of Shannon entropy
# Represents the "equivalent" number of equally-abundant miRNAs
effective_species <- exp(shannon_per_sample)

# Richness: number of detected miRNAs per sample
richness_per_sample <- apply(frac_abundance, 2, function(x) sum(x > 0))

# Pielou's evenness: J = H / H_max = H / ln(S)
# Ranges 0-1; higher = more even distribution
pielou_evenness <- shannon_per_sample / log(richness_per_sample)

# Gini coefficient per sample
gini_per_sample <- apply(frac_abundance, 2, gini_coef)

# Create comprehensive diversity data frame
diversity_df <- data.frame(
  Sample = names(shannon_per_sample),
  Shannon = shannon_per_sample,
  Simpson = simpson_per_sample,
  Effective_Species = effective_species,
  Richness = richness_per_sample,
  Pielou_Evenness = pielou_evenness,
  Gini = gini_per_sample,
  Treatment = sample_mapping[names(shannon_per_sample), "comparison_7"],
  row.names = NULL
)

# Set treatment factor levels
diversity_df$Treatment <- factor(diversity_df$Treatment, 
                                  levels = c("UNSTIM", "HC", "CF_Asp_Neg"))

# Display individual sample values
cat("\n=== Diversity Metrics by Sample ===\n\n")
## 
## === Diversity Metrics by Sample ===
print(diversity_df %>% arrange(Treatment, Sample), digits = 3)
##       Sample Shannon Simpson Effective_Species Richness Pielou_Evenness  Gini  Treatment
## 1  sample_10    2.58   0.738              13.2     1645           0.349 0.978     UNSTIM
## 2  sample_11    3.14   0.783              23.0     2083           0.410 0.927     UNSTIM
## 3  sample_17    2.40   0.686              11.0     1568           0.326 0.979     UNSTIM
## 4  sample_19    2.52   0.712              12.4     1657           0.340 0.979     UNSTIM
## 5  sample_20    2.75   0.747              15.7     1944           0.363 0.977     UNSTIM
## 6  sample_22    2.79   0.723              16.3     2082           0.365 0.960     UNSTIM
## 7  sample_37    2.85   0.764              17.3     2082           0.373 0.968     UNSTIM
## 8  sample_49    4.04   0.907              56.6     2071           0.528 0.951     UNSTIM
## 9   sample_5    3.49   0.837              32.9     1742           0.468 0.958     UNSTIM
## 10 sample_54    2.67   0.727              14.4     2080           0.349 0.972     UNSTIM
## 11 sample_55    4.88   0.920             131.0     2082           0.638 0.768     UNSTIM
## 12 sample_57    2.99   0.792              20.0     2076           0.392 0.972     UNSTIM
## 13 sample_58    2.92   0.766              18.6     2082           0.383 0.965     UNSTIM
## 14 sample_62    2.66   0.744              14.3     1644           0.359 0.976     UNSTIM
## 15 sample_68    2.75   0.747              15.6     2062           0.360 0.975     UNSTIM
## 16 sample_69    2.92   0.797              18.5     1601           0.395 0.972     UNSTIM
## 17 sample_14    3.89   0.920              49.1     1876           0.517 0.963         HC
## 18 sample_18    2.52   0.723              12.4     1692           0.339 0.980         HC
## 19  sample_2    3.72   0.921              41.2     1800           0.496 0.966         HC
## 20 sample_24    3.28   0.829              26.6     1936           0.434 0.971         HC
## 21 sample_26    2.61   0.741              13.6     1758           0.349 0.981         HC
## 22 sample_29    2.74   0.759              15.4     1855           0.363 0.979         HC
## 23  sample_3    4.02   0.934              55.7     1880           0.533 0.959         HC
## 24 sample_33    3.14   0.836              23.1     1701           0.422 0.975         HC
## 25 sample_52    4.25   0.963              70.1     1688           0.572 0.955         HC
## 26  sample_6    3.86   0.928              47.4     1958           0.509 0.967         HC
## 27 sample_63    2.91   0.773              18.3     2082           0.381 0.966         HC
## 28 sample_71    3.69   0.871              40.2     1812           0.492 0.958         HC
## 29 sample_75    3.10   0.811              22.1     1990           0.408 0.975         HC
## 30  sample_9    4.13   0.951              62.3     1844           0.549 0.959         HC
## 31  sample_1    3.05   0.821              21.2     1540           0.416 0.975 CF_Asp_Neg
## 32 sample_16    3.46   0.909              31.7     1899           0.458 0.976 CF_Asp_Neg
## 33 sample_27    3.12   0.828              22.7     2080           0.408 0.972 CF_Asp_Neg
## 34 sample_30    4.96   0.976             142.2     2083           0.649 0.875 CF_Asp_Neg
## 35 sample_31    2.79   0.828              16.3     1830           0.372 0.982 CF_Asp_Neg
## 36 sample_39    2.96   0.841              19.4     1992           0.390 0.979 CF_Asp_Neg
## 37 sample_42    3.00   0.853              20.1     1825           0.399 0.979 CF_Asp_Neg
## 38 sample_45    2.91   0.832              18.4     1756           0.390 0.975 CF_Asp_Neg
## 39 sample_47    3.47   0.886              32.1     2076           0.454 0.966 CF_Asp_Neg
## 40 sample_50    4.05   0.941              57.5     2083           0.530 0.934 CF_Asp_Neg
## 41 sample_51    4.49   0.964              89.3     2083           0.588 0.941 CF_Asp_Neg
## 42 sample_64    3.03   0.847              20.6     2047           0.397 0.981 CF_Asp_Neg
## 43 sample_66    3.12   0.884              22.7     1503           0.427 0.978 CF_Asp_Neg
## 44 sample_70    2.73   0.783              15.3     1715           0.366 0.983 CF_Asp_Neg
## 45 sample_73    3.31   0.894              27.4     1614           0.448 0.974 CF_Asp_Neg
## 46 sample_76    2.67   0.797              14.4     1668           0.359 0.979 CF_Asp_Neg
# Save sample-level diversity data
write.csv(diversity_df, file.path(output_dir, "diversity_metrics_by_sample.csv"), 
          row.names = FALSE)

8.2 Summary Statistics by Treatment

# Summary statistics by treatment
diversity_summary <- diversity_df %>%
  group_by(Treatment) %>%
  summarise(
    n = n(),
    Shannon_Mean = mean(Shannon),
    Shannon_SD = sd(Shannon),
    Simpson_Mean = mean(Simpson),
    Simpson_SD = sd(Simpson),
    Pielou_Mean = mean(Pielou_Evenness),
    Pielou_SD = sd(Pielou_Evenness),
    Effective_Species_Mean = mean(Effective_Species),
    Effective_Species_SD = sd(Effective_Species),
    Gini_Mean = mean(Gini),
    Gini_SD = sd(Gini),
    .groups = 'drop'
  )

cat("\n=== Diversity Metrics Summary by Treatment ===\n\n")
## 
## === Diversity Metrics Summary by Treatment ===
print(diversity_summary, digits = 3)
## # A tibble: 3 × 12
##   Treatment      n Shannon_Mean Shannon_SD Simpson_Mean Simpson_SD Pielou_Mean Pielou_SD Effective_Species_Mean
##   <fct>      <int>        <dbl>      <dbl>        <dbl>      <dbl>       <dbl>     <dbl>                  <dbl>
## 1 UNSTIM        16         3.02      0.635        0.774     0.0654       0.400    0.0813                   26.9
## 2 HC            14         3.42      0.591        0.854     0.0836       0.455    0.0789                   35.5
## 3 CF_Asp_Neg    16         3.32      0.649        0.868     0.0575       0.441    0.0820                   35.7
## # ℹ 3 more variables: Effective_Species_SD <dbl>, Gini_Mean <dbl>, Gini_SD <dbl>
write.csv(diversity_summary, file.path(output_dir, "diversity_summary_by_treatment.csv"), 
          row.names = FALSE)

cat("\n\nInterpretation of diversity metrics:\n")
## 
## 
## Interpretation of diversity metrics:
cat("- Shannon (H'): Higher values = more diverse/even distribution\n")
## - Shannon (H'): Higher values = more diverse/even distribution
cat("- Simpson (1-D): Probability two random reads are different miRNAs (higher = more diverse)\n")
## - Simpson (1-D): Probability two random reads are different miRNAs (higher = more diverse)
cat("- Pielou's J: Evenness relative to maximum possible (0-1 scale)\n")
## - Pielou's J: Evenness relative to maximum possible (0-1 scale)
cat("- Effective Species: Equivalent number of equally-abundant miRNAs\n")
## - Effective Species: Equivalent number of equally-abundant miRNAs
cat("- Gini: Inequality coefficient (higher = more dominated by few miRNAs)\n")
## - Gini: Inequality coefficient (higher = more dominated by few miRNAs)

8.3 Statistical Testing: ANOVA for Diversity Indices

# ANOVA for Shannon diversity
cat("\n=== ANOVA: Shannon Diversity by Treatment ===\n")
## 
## === ANOVA: Shannon Diversity by Treatment ===
shannon_aov <- aov(Shannon ~ Treatment, data = diversity_df)
print(summary(shannon_aov))
##             Df Sum Sq Mean Sq F value Pr(>F)
## Treatment    2  1.311  0.6553   1.668  0.201
## Residuals   43 16.895  0.3929
shannon_pval <- summary(shannon_aov)[[1]][["Pr(>F)"]][1]

if(shannon_pval < 0.05) {
  cat("\n=== Tukey HSD Post-hoc: Shannon Diversity ===\n")
  shannon_tukey <- TukeyHSD(shannon_aov)
  print(shannon_tukey)
}

# ANOVA for Simpson diversity
cat("\n=== ANOVA: Simpson Diversity by Treatment ===\n")
## 
## === ANOVA: Simpson Diversity by Treatment ===
simpson_aov <- aov(Simpson ~ Treatment, data = diversity_df)
print(summary(simpson_aov))
##             Df  Sum Sq Mean Sq F value   Pr(>F)    
## Treatment    2 0.08046 0.04023   8.464 0.000795 ***
## Residuals   43 0.20439 0.00475                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
simpson_pval <- summary(simpson_aov)[[1]][["Pr(>F)"]][1]

if(simpson_pval < 0.05) {
  cat("\n=== Tukey HSD Post-hoc: Simpson Diversity ===\n")
  simpson_tukey <- TukeyHSD(simpson_aov)
  print(simpson_tukey)
}
## 
## === Tukey HSD Post-hoc: Simpson Diversity ===
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Simpson ~ Treatment, data = diversity_df)
## 
## $Treatment
##                         diff         lwr        upr     p adj
## HC-UNSTIM         0.07993892  0.01869232 0.14118552 0.0077873
## CF_Asp_Neg-UNSTIM 0.09332780  0.03415796 0.15249764 0.0011799
## CF_Asp_Neg-HC     0.01338888 -0.04785773 0.07463548 0.8567601
# ANOVA for Pielou's evenness
cat("\n=== ANOVA: Pielou's Evenness by Treatment ===\n")
## 
## === ANOVA: Pielou's Evenness by Treatment ===
pielou_aov <- aov(Pielou_Evenness ~ Treatment, data = diversity_df)
print(summary(pielou_aov))
##             Df  Sum Sq  Mean Sq F value Pr(>F)
## Treatment    2 0.02471 0.012357   1.891  0.163
## Residuals   43 0.28097 0.006534
pielou_pval <- summary(pielou_aov)[[1]][["Pr(>F)"]][1]

if(pielou_pval < 0.05) {
  cat("\n=== Tukey HSD Post-hoc: Pielou's Evenness ===\n")
  pielou_tukey <- TukeyHSD(pielou_aov)
  print(pielou_tukey)
}

# ANOVA for Gini coefficient
cat("\n=== ANOVA: Gini Coefficient by Treatment ===\n")
## 
## === ANOVA: Gini Coefficient by Treatment ===
gini_aov <- aov(Gini ~ Treatment, data = diversity_df)
print(summary(gini_aov))
##             Df  Sum Sq   Mean Sq F value Pr(>F)
## Treatment    2 0.00154 0.0007703   0.629  0.538
## Residuals   43 0.05263 0.0012239
gini_pval <- summary(gini_aov)[[1]][["Pr(>F)"]][1]

if(gini_pval < 0.05) {
  cat("\n=== Tukey HSD Post-hoc: Gini Coefficient ===\n")
  gini_tukey <- TukeyHSD(gini_aov)
  print(gini_tukey)
}

# Summary of significance
cat("\n=== Summary of Diversity ANOVA Results ===\n")
## 
## === Summary of Diversity ANOVA Results ===
diversity_anova_summary <- data.frame(
  Index = c("Shannon", "Simpson", "Pielou_Evenness", "Gini"),
  ANOVA_P_Value = c(shannon_pval, simpson_pval, pielou_pval, gini_pval),
  Significant = c(
    ifelse(shannon_pval < 0.05, "Yes", "No"),
    ifelse(simpson_pval < 0.05, "Yes", "No"),
    ifelse(pielou_pval < 0.05, "Yes", "No"),
    ifelse(gini_pval < 0.05, "Yes", "No")
  )
)
print(diversity_anova_summary, row.names = FALSE)
##            Index ANOVA_P_Value Significant
##          Shannon   0.200650358          No
##          Simpson   0.000795274         Yes
##  Pielou_Evenness   0.163253636          No
##             Gini   0.537732140          No
write.csv(diversity_anova_summary, file.path(output_dir, "diversity_ANOVA_results.csv"), 
          row.names = FALSE)

8.4 Visualization: Shannon Diversity

# Create Shannon diversity boxplot
p_shannon <- ggplot(diversity_df, aes(x = Treatment, y = Shannon, fill = Treatment)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
  geom_jitter(width = 0.15, alpha = 0.7, size = 3) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 4, 
               fill = "red", color = "darkred") +
  scale_fill_brewer(palette = "Set2", labels = c("CF_Asp_Neg" = "CF")) +
  scale_x_discrete(labels = c("CF_Asp_Neg" = "CF")) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Shannon Diversity of miRNA Cargo by Treatment",
    subtitle = "Higher values indicate more even distribution across miRNA species\nRed diamonds = group means",
    x = "Treatment",
    y = "Shannon Diversity Index (H')"
  ) +
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    legend.position = "none"
  )

# Add significance annotation if ANOVA is significant
if(shannon_pval < 0.05) {
  # Get Tukey results for annotation
  tukey_df <- as.data.frame(shannon_tukey$Treatment)
  tukey_df$comparison <- rownames(tukey_df)
  sig_comparisons <- tukey_df[tukey_df$`p adj` < 0.05, ]
  
  if(nrow(sig_comparisons) > 0) {
    p_shannon <- p_shannon +
      annotate("text", x = 2, y = max(diversity_df$Shannon) + 0.1,
               label = sprintf("ANOVA p = %.4f", shannon_pval),
               size = 3.5, fontface = "italic")
  }
}

print(p_shannon)

save_figure(p_shannon, "shannon_diversity_by_treatment", width = 8, height = 6)
## Saved: shannon_diversity_by_treatment .png/.pdf

8.5 Visualization: Multi-Panel Diversity Metrics

# Reshape for faceted plot
diversity_long <- diversity_df %>%
  dplyr::select(Sample, Treatment, Shannon, Simpson, Pielou_Evenness, Gini) %>%
  pivot_longer(cols = c(Shannon, Simpson, Pielou_Evenness, Gini), 
               names_to = "Index", values_to = "Value") %>%
  mutate(Index = factor(Index, 
                        levels = c("Shannon", "Simpson", "Pielou_Evenness", "Gini"),
                        labels = c("Shannon (H')", "Simpson (1-D)", 
                                   "Pielou Evenness (J)", "Gini Coefficient")))

# Create multi-panel plot
p_diversity_panel <- ggplot(diversity_long, aes(x = Treatment, y = Value, fill = Treatment)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
  geom_jitter(width = 0.15, alpha = 0.6, size = 2) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, 
               fill = "red", color = "darkred") +
  facet_wrap(~ Index, scales = "free_y", ncol = 4) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Diversity and Evenness Metrics by Treatment",
    subtitle = "Multiple indices capture different aspects of compositional diversity | Red diamonds = group means",
    x = "Treatment",
    y = "Index Value"
  ) +
  theme_bw(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    strip.text = element_text(face = "bold", size = 11),
    strip.background = element_rect(fill = "grey95"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

print(p_diversity_panel)

save_figure(p_diversity_panel, "diversity_panel_by_treatment", width = 14, height = 5)
## Saved: diversity_panel_by_treatment .png/.pdf

8.6 Effective Number of Species

The effective number of species (Hill number q=1) provides an intuitive interpretation: how many equally-abundant miRNAs would produce the same Shannon diversity?

# Create effective species boxplot
p_effective <- ggplot(diversity_df, aes(x = Treatment, y = Effective_Species, fill = Treatment)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
  geom_jitter(width = 0.15, alpha = 0.7, size = 3) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 4, 
               fill = "red", color = "darkred") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Effective Number of miRNA Species by Treatment",
    subtitle = "Equivalent number of equally-abundant miRNAs (exp of Shannon entropy)",
    x = "Treatment",
    y = "Effective Number of Species"
  ) +
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    legend.position = "none"
  )

print(p_effective)

save_figure(p_effective, "effective_species_by_treatment", width = 8, height = 6)
## Saved: effective_species_by_treatment .png/.pdf
# Report means
cat("\n=== Effective Number of Species by Treatment ===\n")
## 
## === Effective Number of Species by Treatment ===
diversity_df %>%
  group_by(Treatment) %>%
  summarise(
    Mean = mean(Effective_Species),
    SD = sd(Effective_Species),
    Median = median(Effective_Species),
    .groups = 'drop'
  ) %>%
  print(digits = 2)
## # A tibble: 3 × 4
##   Treatment   Mean    SD Median
##   <fct>      <dbl> <dbl>  <dbl>
## 1 UNSTIM      26.9  29.9   16.8
## 2 HC          35.5  19.3   33.4
## 3 CF_Asp_Neg  35.7  34.3   21.9

8.7 Rank-Abundance Curves (Whittaker Plots)

Rank-abundance curves show how steeply abundance drops off with rank. Steeper curves indicate communities more dominated by a few species; flatter curves indicate more even distributions.

# Calculate mean fractional abundance per miRNA for each treatment
rank_abundance_data <- frac_abundance %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  pivot_longer(-miRNA, names_to = "Sample", values_to = "Fraction") %>%
  mutate(Treatment = sample_mapping[Sample, "comparison_7"]) %>%
  group_by(Treatment, miRNA) %>%
  summarise(Mean_Fraction = mean(Fraction), .groups = "drop") %>%
  group_by(Treatment) %>%
  arrange(desc(Mean_Fraction)) %>%
  mutate(Rank = row_number()) %>%
  ungroup() %>%
  mutate(Treatment = factor(Treatment, levels = c("UNSTIM", "HC", "CF_Asp_Neg")))

# Create rank-abundance plot
p_rank_abundance <- ggplot(rank_abundance_data, 
                           aes(x = Rank, y = Mean_Fraction, color = Treatment)) +
  geom_line(size = 1, alpha = 0.8) +
  scale_y_log10(labels = percent_format(accuracy = 0.01)) +
  scale_x_log10() +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Rank-Abundance Curves by Treatment",
    subtitle = "Steeper curves indicate more uneven distributions dominated by few miRNAs",
    x = "Rank (log scale)",
    y = "Mean Fractional Abundance (log scale)",
    color = "Treatment"
  ) +
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    legend.position = "bottom"
  )

print(p_rank_abundance)

save_figure(p_rank_abundance, "rank_abundance_curves", width = 10, height = 7)
## Saved: rank_abundance_curves .png/.pdf

8.8 Rank-Abundance Curves (Top 100 miRNAs)

# Zoom to top 100 miRNAs
p_rank_abundance_zoom <- ggplot(rank_abundance_data %>% filter(Rank <= 100), 
                                aes(x = Rank, y = Mean_Fraction, color = Treatment)) +
  geom_line(size = 1.2, alpha = 0.8) +
  geom_point(size = 1.5, alpha = 0.6) +
  scale_y_log10(labels = percent_format(accuracy = 0.01)) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Rank-Abundance Curves: Top 100 miRNAs",
    subtitle = "Focused view of most abundant miRNAs across treatment groups",
    x = "Rank",
    y = "Mean Fractional Abundance (log scale)",
    color = "Treatment"
  ) +
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 10),
    legend.position = "bottom"
  )

print(p_rank_abundance_zoom)

save_figure(p_rank_abundance_zoom, "rank_abundance_curves_top100", width = 10, height = 7)
## Saved: rank_abundance_curves_top100 .png/.pdf

8.8 Diversity Interpretation Summary

cat("\n")
cat("================================================================================\n")
## ================================================================================
cat("                    DIVERSITY ANALYSIS INTERPRETATION                           \n")
##                     DIVERSITY ANALYSIS INTERPRETATION
cat("================================================================================\n\n")
## ================================================================================
# Calculate and report key findings
diversity_means <- diversity_df %>%
  group_by(Treatment) %>%
  summarise(
    Shannon = mean(Shannon),
    Pielou = mean(Pielou_Evenness),
    Gini = mean(Gini),
    Effective = mean(Effective_Species),
    .groups = 'drop'
  )

cat("KEY FINDINGS:\n\n")
## KEY FINDINGS:
cat("1. Shannon Diversity (H'):\n")
## 1. Shannon Diversity (H'):
for(i in 1:nrow(diversity_means)) {
  cat(sprintf("   %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Shannon[i]))
}
##    UNSTIM: 3.021
##    HC: 3.418
##    CF_Asp_Neg: 3.320
cat(sprintf("   Order: %s\n\n", 
            paste(diversity_means$Treatment[order(diversity_means$Shannon, decreasing = TRUE)], 
                  collapse = " > ")))
##    Order: HC > CF_Asp_Neg > UNSTIM
cat("2. Pielou's Evenness (J):\n")
## 2. Pielou's Evenness (J):
for(i in 1:nrow(diversity_means)) {
  cat(sprintf("   %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Pielou[i]))
}
##    UNSTIM: 0.400
##    HC: 0.455
##    CF_Asp_Neg: 0.441
cat(sprintf("   Order: %s\n\n", 
            paste(diversity_means$Treatment[order(diversity_means$Pielou, decreasing = TRUE)], 
                  collapse = " > ")))
##    Order: HC > CF_Asp_Neg > UNSTIM
cat("3. Gini Coefficient (inequality):\n")
## 3. Gini Coefficient (inequality):
for(i in 1:nrow(diversity_means)) {
  cat(sprintf("   %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Gini[i]))
}
##    UNSTIM: 0.955
##    HC: 0.968
##    CF_Asp_Neg: 0.965
cat(sprintf("   Order (most to least concentrated): %s\n\n", 
            paste(diversity_means$Treatment[order(diversity_means$Gini, decreasing = TRUE)], 
                  collapse = " > ")))
##    Order (most to least concentrated): HC > CF_Asp_Neg > UNSTIM
cat("4. Effective Number of Species:\n")
## 4. Effective Number of Species:
for(i in 1:nrow(diversity_means)) {
  cat(sprintf("   %s: %.1f miRNAs\n", diversity_means$Treatment[i], diversity_means$Effective[i]))
}
##    UNSTIM: 26.9 miRNAs
##    HC: 35.5 miRNAs
##    CF_Asp_Neg: 35.7 miRNAs
cat("\n--------------------------------------------------------------------------------\n")
## 
## --------------------------------------------------------------------------------
cat("BIOLOGICAL INTERPRETATION:\n\n")
## BIOLOGICAL INTERPRETATION:
# Determine the pattern
if(diversity_means$Shannon[diversity_means$Treatment == "HC"] > 
   diversity_means$Shannon[diversity_means$Treatment == "CF_Asp_Neg"] &&
   diversity_means$Shannon[diversity_means$Treatment == "CF_Asp_Neg"] > 
   diversity_means$Shannon[diversity_means$Treatment == "UNSTIM"]) {
  
  cat("The pattern HC > CF > UNSTIM for miRNA diversity suggests:\n\n")
  cat("• Exposure to bronchoalveolar lavage stimulates HMSCs to diversify their\n")
  cat("  EV-associated miRNA cargo beyond the baseline (UNSTIM) state.\n\n")
  cat("• Healthy BAL induces the GREATEST diversification, suggesting that healthy\n")
  cat("  lung signals trigger a broad, complex cellular response in HMSCs.\n\n")
  cat("• CF BAL produces an ATTENUATED diversification response—miRNA cargo\n")
  cat("  becomes more diverse than unstimulated cells but less diverse than\n")
  cat("  healthy BAL-exposed cells.\n\n")
  cat("• This 'diversity deficit' in CF-exposed EVs may reflect:\n")
  cat("  (a) Impaired HMSC responsiveness to disease-associated signals\n")
  cat("  (b) A fundamentally different signaling landscape in CF lungs that fails\n")
  cat("      to trigger full cargo diversification\n")
  cat("  (c) Presence of inhibitory factors in CF BAL that constrain EV loading\n\n")
  cat("• THERAPEUTIC IMPLICATION: If therapeutic efficacy requires diverse miRNA\n")
  cat("  cargo for pleiotropic effects, CF-conditioned EVs may be less effective\n")
  cat("  not because they lack specific miRNAs, but because they lack\n")
  cat("  compositional complexity.\n")
  
} else {
  cat("The observed diversity pattern differs from the expected HC > CF > UNSTIM.\n")
  cat("Please examine the summary statistics above to interpret the biological\n")
  cat("significance of the observed pattern.\n")
}
## The pattern HC > CF > UNSTIM for miRNA diversity suggests:
## 
## • Exposure to bronchoalveolar lavage stimulates HMSCs to diversify their
##   EV-associated miRNA cargo beyond the baseline (UNSTIM) state.
## 
## • Healthy BAL induces the GREATEST diversification, suggesting that healthy
##   lung signals trigger a broad, complex cellular response in HMSCs.
## 
## • CF BAL produces an ATTENUATED diversification response—miRNA cargo
##   becomes more diverse than unstimulated cells but less diverse than
##   healthy BAL-exposed cells.
## 
## • This 'diversity deficit' in CF-exposed EVs may reflect:
##   (a) Impaired HMSC responsiveness to disease-associated signals
##   (b) A fundamentally different signaling landscape in CF lungs that fails
##       to trigger full cargo diversification
##   (c) Presence of inhibitory factors in CF BAL that constrain EV loading
## 
## • THERAPEUTIC IMPLICATION: If therapeutic efficacy requires diverse miRNA
##   cargo for pleiotropic effects, CF-conditioned EVs may be less effective
##   not because they lack specific miRNAs, but because they lack
##   compositional complexity.
cat("\n================================================================================\n")
## 
## ================================================================================

9. Export Data for Further Analysis

# Export fractional abundance for major miRNAs
major_frac_export <- frac_abundance[major_mirna_names, ] %>%
  as.data.frame() %>%
  mutate(miRNA = rownames(.)) %>%
  dplyr::select(miRNA, everything())

write.csv(major_frac_export, file.path(output_dir, "major_miRNAs_fractional_abundances.csv"), 
          row.names = FALSE)

# Export all fractional abundances (sorted by mean)
all_frac_export <- frac_abundance %>%
  as.data.frame() %>%
  mutate(
    miRNA = rownames(.),
    Mean_Fraction = rowMeans(.)
  ) %>%
  arrange(desc(Mean_Fraction)) %>%
  dplyr::select(miRNA, Mean_Fraction, everything())

write.csv(all_frac_export, file.path(output_dir, "all_miRNA_fractional_abundances.csv"), 
          row.names = FALSE)

cat("\n=== Data Files Exported ===\n\n")
## 
## === Data Files Exported ===
cat("Compositional Analysis:\n")
## Compositional Analysis:
cat("  1. major_miRNAs_1pct_summary.csv\n")
##   1. major_miRNAs_1pct_summary.csv
cat("  2. major_miRNAs_fractional_abundances.csv\n")
##   2. major_miRNAs_fractional_abundances.csv
cat("  3. all_miRNA_fractional_abundances.csv\n")
##   3. all_miRNA_fractional_abundances.csv
cat("  4. major_miRNAs_ANOVA_results.csv\n")
##   4. major_miRNAs_ANOVA_results.csv
cat("  5. major_miRNAs_posthoc_comparisons.csv (if significant differences found)\n")
##   5. major_miRNAs_posthoc_comparisons.csv (if significant differences found)
cat("  6. major_miRNAs_treatment_summary.csv\n")
##   6. major_miRNAs_treatment_summary.csv
cat("  7. KS_test_results.csv\n\n")
##   7. KS_test_results.csv
cat("Diversity Analysis:\n")
## Diversity Analysis:
cat("  8. diversity_metrics_by_sample.csv\n")
##   8. diversity_metrics_by_sample.csv
cat("  9. diversity_summary_by_treatment.csv\n")
##   9. diversity_summary_by_treatment.csv
cat(" 10. diversity_ANOVA_results.csv\n\n")
##  10. diversity_ANOVA_results.csv
cat("=== Figures Created ===\n\n")
## === Figures Created ===
cat("Compositional Visualizations:\n")
## Compositional Visualizations:
cat("  1.  major_miRNAs_barplot.png/.pdf\n")
##   1.  major_miRNAs_barplot.png/.pdf
cat("  2.  cumulative_abundance_plot.png/.pdf\n")
##   2.  cumulative_abundance_plot.png/.pdf
cat("  3.  composition_stacked_bar.png/.pdf\n")
##   3.  composition_stacked_bar.png/.pdf
cat("  4.  major_miRNAs_heatmap_by_sample.png/.pdf\n")
##   4.  major_miRNAs_heatmap_by_sample.png/.pdf
cat("  5.  major_miRNAs_boxplot_by_group.png/.pdf\n")
##   5.  major_miRNAs_boxplot_by_group.png/.pdf
cat("  6.  major_miRNAs_significant_differences.png/.pdf (if significant)\n")
##   6.  major_miRNAs_significant_differences.png/.pdf (if significant)
cat("  7.  major_miRNAs_stacked_by_sample.png/.pdf\n")
##   7.  major_miRNAs_stacked_by_sample.png/.pdf
cat("  8.  abundance_density_plots.png/.pdf\n")
##   8.  abundance_density_plots.png/.pdf
cat("  9.  ECDF_by_treatment.png/.pdf\n")
##   9.  ECDF_by_treatment.png/.pdf
cat(" 10.  cumulative_abundance_by_treatment.png/.pdf\n\n")
##  10.  cumulative_abundance_by_treatment.png/.pdf
cat("Diversity Visualizations:\n")
## Diversity Visualizations:
cat(" 11.  shannon_diversity_by_treatment.png/.pdf\n")
##  11.  shannon_diversity_by_treatment.png/.pdf
cat(" 12.  diversity_panel_by_treatment.png/.pdf\n")
##  12.  diversity_panel_by_treatment.png/.pdf
cat(" 13.  effective_species_by_treatment.png/.pdf\n")
##  13.  effective_species_by_treatment.png/.pdf
cat(" 14.  rank_abundance_curves.png/.pdf\n")
##  14.  rank_abundance_curves.png/.pdf
cat(" 15.  rank_abundance_curves_top100.png/.pdf\n")
##  15.  rank_abundance_curves_top100.png/.pdf
cat("\n=== All outputs saved to: ===\n")
## 
## === All outputs saved to: ===
cat(output_dir, "\n")
## /Users/d20605c/Documents/MSC-EV-miRNA-CF-BAL/results/01_compositional

10. Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.22.0         AnnotationDbi_1.72.0        qvalue_2.42.0               ReactomePA_1.54.0          
##  [5] enrichplot_1.30.4           clusterProfiler_4.18.4      multiMiR_1.32.0             DESeq2_1.50.2              
##  [9] SummarizedExperiment_1.40.0 Biobase_2.70.0              MatrixGenerics_1.22.0       matrixStats_1.5.0          
## [13] GenomicRanges_1.62.1        Seqinfo_1.0.0               IRanges_2.44.0              S4Vectors_0.48.0           
## [17] BiocGenerics_0.56.0         generics_0.1.4              flextable_0.9.10            officer_0.7.2              
## [21] ggrepel_0.9.6               scales_1.4.0                RColorBrewer_1.1-3          ggplot2_4.0.1              
## [25] tidyr_1.3.2                 readr_2.1.6                 dplyr_1.1.4                 readxl_1.4.5               
## [29] here_1.0.2                 
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.5.2           bitops_1.0-9            ggplotify_0.1.3         tibble_3.3.0            R.oo_1.27.1            
##   [6] cellranger_1.1.0        polyclip_1.10-7         graph_1.88.1            XML_3.99-0.20           lifecycle_1.0.4        
##  [11] rprojroot_2.1.1         lattice_0.22-7          vroom_1.6.7             MASS_7.3-65             magrittr_2.0.4         
##  [16] sass_0.4.10             rmarkdown_2.30          jquerylib_0.1.4         yaml_2.3.12             ggtangle_0.0.9         
##  [21] zip_2.3.3               askpass_1.2.1           cowplot_1.2.0           DBI_1.2.3               abind_1.4-8            
##  [26] purrr_1.2.0             R.utils_2.13.0          ggraph_2.2.2            RCurl_1.98-1.17         yulab.utils_0.2.3      
##  [31] tweenr_2.0.3            rappdirs_0.3.3          gdtools_0.4.4           tidytree_0.4.6          reactome.db_1.94.0     
##  [36] codetools_0.2-20        DelayedArray_0.36.0     DOSE_4.4.0              xml2_1.5.1              ggforce_0.5.0          
##  [41] tidyselect_1.2.1        aplot_0.2.9             farver_2.1.2            viridis_0.6.5           jsonlite_2.0.0         
##  [46] tidygraph_1.3.1         systemfonts_1.3.1       tools_4.5.2             ggnewscale_0.5.2        treeio_1.34.0          
##  [51] ragg_1.5.0              Rcpp_1.1.0              glue_1.8.0              gridExtra_2.3           SparseArray_1.10.8     
##  [56] xfun_0.55               withr_3.0.2             BiocManager_1.30.27     fastmap_1.2.0           openssl_2.3.4          
##  [61] digest_0.6.39           R6_2.6.1                gridGraphics_0.5-1      textshaping_1.0.4       GO.db_3.22.0           
##  [66] RSQLite_2.4.5           R.methodsS3_1.8.2       utf8_1.2.6              fontLiberation_0.1.0    renv_1.1.5             
##  [71] data.table_1.18.0       graphlayouts_1.2.2      httr_1.4.7              htmlwidgets_1.6.4       S4Arrays_1.10.1        
##  [76] scatterpie_0.2.6        graphite_1.56.0         pkgconfig_2.0.3         gtable_0.3.6            blob_1.2.4             
##  [81] S7_0.2.1                XVector_0.50.0          htmltools_0.5.9         fontBitstreamVera_0.1.1 fgsea_1.36.0           
##  [86] png_0.1-8               ggfun_0.2.0             knitr_1.51              tzdb_0.5.0              reshape2_1.4.5         
##  [91] uuid_1.2-1              nlme_3.1-168            cachem_1.1.0            stringr_1.6.0           parallel_4.5.2         
##  [96] pillar_1.11.1           grid_4.5.2              vctrs_0.6.5             tidydr_0.0.6            cluster_2.1.8.1        
## [101] evaluate_1.0.5          cli_3.6.5               locfit_1.5-9.12         compiler_4.5.2          rlang_1.1.6            
## [106] crayon_1.5.3            labeling_0.4.3          plyr_1.8.9              fs_1.6.6                ggiraph_0.9.2          
## [111] stringi_1.8.7           viridisLite_0.4.2       BiocParallel_1.44.0     Biostrings_2.78.0       lazyeval_0.2.2         
## [116] GOSemSim_2.36.0         fontquiver_0.2.1        Matrix_1.7-4            hms_1.1.4               patchwork_1.3.2        
## [121] bit64_4.6.0-1           KEGGREST_1.50.0         igraph_2.2.1            memoise_2.0.1           bslib_0.9.0            
## [126] ggtree_4.0.3            fastmatch_1.1-6         bit_4.6.0               ape_5.8-1               gson_0.1.0

Summary

This compositional and diversity analysis demonstrates that:

  1. 13 miRNAs individually account for ≥1% of total reads, collectively representing 70.4% of all miRNA reads in extracellular vesicles from hMSCs.

  2. The most abundant miRNA is miR-6126, representing 33.17% of total reads.

  3. The remaining 2070 detected miRNAs collectively represent only 29.6% of total reads.

  4. This highly skewed distribution is typical of miRNA populations in extracellular vesicles, where a small number of miRNAs dominate the cargo.

  5. The stacked bar plot (Section 6.6) shows compositional consistency within treatment groups and reveals compositional differences between groups at the individual sample level.

  6. Statistical testing reveals which major miRNAs differ between treatment groups (Section 6.3):

    • One-way ANOVA tests performed for each of the 13 major miRNAs
    • Bonferroni correction applied to control for multiple testing
    • Post-hoc pairwise comparisons (Tukey HSD) identify specific group differences
    • Results show which miRNAs are differentially abundant in response to treatment conditions
  7. Treatment groups show significantly different compositional structures (Section 7):

    • All pairwise Kolmogorov-Smirnov tests are highly significant (p < 0.05)
    • HC (healthy control BAL) exhibits a broader distributional tail, suggesting greater compositional diversity
    • CF_Asp_Neg (CF BAL exposed) is compositionally more similar to UNSTIM than to HC
    • These differences indicate that exposure conditions fundamentally alter miRNA cargo composition
  8. Diversity analysis provides quantitative metrics for compositional complexity (Section 8):

    • Shannon diversity, Simpson’s diversity, Pielou’s evenness, and Gini coefficient calculated per sample
    • Rank-abundance curves visualize the steepness of species abundance decay
    • Effective number of species provides intuitive interpretation of diversity values
    • Statistical testing (ANOVA with Tukey HSD) identifies significant treatment effects on diversity
  9. The analysis identifies not only which specific miRNAs are most abundant, but also which show treatment-dependent changes, providing insight into both the stable “core” miRNA cargo and the treatment-responsive miRNA components in hMSC-derived EVs.

  10. Diversity metrics suggest BAL exposure promotes miRNA cargo diversification, with healthy BAL inducing greater diversity than CF BAL. This “diversity deficit” in CF-exposed EVs may have therapeutic implications if compositional complexity is required for pleiotropic effects.

The visualizations and data tables provide multiple perspectives on compositional structure, focusing on biologically relevant miRNAs that individually contribute ≥1% to the total miRNA population in hMSC-derived extracellular vesicles under different exposure conditions.